# Kernel: microbench
# InsCnt: 18
# RegCnt: 5
# SharedSize: 4096
# BarCnt: 1
# Params(3):
#   ord:addr:size:align
#   0:0x140:4:0
#   1:0x144:4:0
#   2:0x148:4:0

// This is a simple micro bench to demonstrate the latency in loading SR_TID.X

//blockDim.x c[0x0][0x08]
//blockDim.y c[0x0][0x0c]
//blockDim.z c[0x0][0x10]
//gridDim.x  c[0x0][0x14]
//gridDim.y  c[0x0][0x18]
//gridDim.z  c[0x0][0x1c]


<REGISTER_MAPPING>

    0a-20 : out, clocks, in, tid, bid, blockDim, clock1, clock2, result, x

</REGISTER_MAPPING>

// Load in our params
--:-:-:-:1      MOV out,      c[0x0][0x140];
--:-:-:-:1      MOV clocks,   c[0x0][0x144];
--:-:-:-:1      MOV in,       c[0x0][0x148];
--:-:-:-:1      MOV blockDim, c[0x0][0x8];

// Get the first clock value
--:-:-:-:1      CS2R clock1, SR_CLOCKLO;

// Get the threadId and blockId
// Set the Read-After-Write dependency barrier 1 and 2
--:-:1:-:1      S2R tid, SR_TID.X;
// Add one additional clock stall to allow the barrier time to set prior to next instruction that uses it
--:-:2:-:2      S2R bid, SR_CTAID.X;


// Get the second clock value
// Wait on the depenedency barriers that were set in the prior instruction
// Stall 6 to allow CS2R time to complete before next instruction
// CS2R takes a constant 6 clocks to complete unlike S2R which is a variable 22-44 clocks
// This stall count does not factor into the time calculation at all
03:-:-:-:6      CS2R clock2, SR_CLOCKLO;

// Take the difference of clocks
--:-:-:-:1      IADD clock1, clock2, -clock1;

// Setup our output addresses
// Stall your pipeline dependencies properly
03:-:-:-:1      XMAD tid, blockDim, bid, tid;
--:-:-:Y:6      XMAD.MRG x, blockDim, bid.H1, RZ;
--:-:-:Y:6      XMAD.PSL.CBCC tid, blockDim.H1, x.H1, tid;
--:-:-:Y:6      SHL  tid, tid, 0x2;

--:-:-:-:1      IADD clocks, clocks, tid;
--:-:-:-:1      IADD out,  out,  tid;

// Output the results.
// No stall needed on prior instruction as memory store instructions have a 5 clock delay in picking up register values
--:-:-:-:1      STG [clocks], clock1;
--:-:-:-:1      STG [out],    tid;
--:-:-:-:5      EXIT;

